# === Data Understanding ===
## ==== Step 0: Load Necessary Libraries ====
cat("\n==== Step 0: Load Necessary Libraries ====\n")
##
## ==== Step 0: Load Necessary Libraries ====
# Every package this analysis depends on, with a short note on each one's role
required_libraries <- c(
  "tidyr",        # Data tidying
  "ggplot2",      # Visualization
  "dplyr",        # Data manipulation
  "caret",        # Machine learning and feature selection
  "randomForest", # Random Forest implementation
  "scales",       # Scaling and formatting in plots
  "reshape2",     # Data reshaping
  "glue",         # String interpolation
  "moments",      # Skewness and kurtosis calculations
  "data.table",   # Efficient data handling
  "RColorBrewer", # Color palettes
  "patchwork"     # Combining ggplot objects
)
# Install a package on demand, then attach it.
# requireNamespace() is used for the availability check because it does not
# attach the package; library() then performs the actual attach and errors
# loudly if the install failed.
load_library <- function(package) {
  if (!requireNamespace(package, quietly = TRUE)) {
    cat(sprintf("Installing missing library: %s\n", package))
    install.packages(package, dependencies = TRUE)
  }
  library(package, character.only = TRUE)
}
# Attach every required package; invisible() suppresses lapply()'s list result
invisible(lapply(required_libraries, load_library))
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lattice
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
cat("All required libraries are loaded.\n")
## All required libraries are loaded.
# --- Step 1: Load Dataset ---
cat("\n==== Step 1: Load Dataset ====\n")
##
## ==== Step 1: Load Dataset ====
data <- tryCatch(
{
read.csv("car_price.csv")
},
error = function(e) {
stop("Error loading dataset: ", e$message)
}
)
cat("Dataset loaded successfully.\n")
## Dataset loaded successfully.
# --- Step 2: Overview of Loaded Dataset ---
cat("\n==== Step 2: Overview of Loaded Dataset ====\n")
##
## ==== Step 2: Overview of Loaded Dataset ====
# Dataset dimensions
cat("\n---- Dataset Overview ----\n")
##
## ---- Dataset Overview ----
cat("Dataset Dimensions (Rows x Columns): ", dim(data), "\n")
## Dataset Dimensions (Rows x Columns): 215 26
cat("Column Names:\n", paste(names(data), collapse = ", "), "\n")
## Column Names:
## car_ID, symboling, CarName, fueltype, aspiration, doornumber, carbody, drivewheel, enginelocation, wheelbase, carlength, carwidth, carheight, curbweight, enginetype, cylindernumber, enginesize, fuelsystem, boreratio, stroke, compressionratio, horsepower, peakrpm, citympg, highwaympg, price
# Calculate and print dataset size in memory
data_size <- object.size(data)
cat("Approximate Data Size in Memory: ", format(data_size, units = "auto"), "\n")
## Approximate Data Size in Memory: 59.4 Kb
# Data types and structure
cat("\n---- Data Types and Structure ----\n")
##
## ---- Data Types and Structure ----
# str() prints its report as a side effect and returns NULL invisibly, so it
# must not be wrapped in print() — doing so previously echoed a spurious
# "NULL" line after the structure listing.
str(data)
## 'data.frame': 215 obs. of 26 variables:
## $ car_ID : num 1 2 3 4 5 6 7 8 9 NA ...
## $ symboling : num 3 3 1 2 2 2 1 1 1 NA ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 NA ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 NA ...
## $ curbweight : num 2548 2548 2823 2337 2824 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : num 130 130 152 109 136 136 136 136 131 NA ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 NA ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 NA ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 NA ...
## $ horsepower : num 111 111 154 102 115 110 110 110 140 NaN ...
## $ peakrpm : num 5000 5000 5000 5500 5500 5500 5500 5500 5500 NA ...
## $ citympg : num 21 21 19 24 18 19 19 19 17 NA ...
## $ highwaympg : num 27 27 26 30 22 25 25 25 20 NA ...
## $ price : num 13495 16500 16500 13950 17450 ...
# --- Step 3: Convert Data Types ---
cat("\n==== Step 3: Convert IDs and Categorical Columns to Appropriate Types ====\n")
##
## ==== Step 3: Convert IDs and Categorical Columns to Appropriate Types ====
# Convert the identifier and every nominal column from character to factor in
# a single vectorized pass (car_ID is treated as an identifier, not a number).
factor_cols <- c(
  "car_ID",
  "fueltype", "aspiration", "doornumber", "carbody", "drivewheel",
  "enginelocation", "enginetype", "cylindernumber", "fuelsystem"
)
data[factor_cols] <- lapply(data[factor_cols], as.factor)
cat("Data types converted successfully.\n")
## Data types converted successfully.
# --- Step 4: Profile the Dataset ---
cat("\n==== Step 4: Profile the Dataset ====\n")
##
## ==== Step 4: Profile the Dataset ====
# Summary statistics
cat("\n---- Summary Statistics ----\n")
##
## ---- Summary Statistics ----
# Partition the columns by type, leaving out the car_ID identifier; vapply()
# pins the result type where sapply() would guess.
numeric_cols <- setdiff(names(data)[vapply(data, is.numeric, logical(1))], "car_ID")
categorical_cols <- setdiff(names(data)[vapply(data, is.factor, logical(1))], "car_ID")
cat("\n--- Summary Statistics for Numeric Columns ---\n")
##
## --- Summary Statistics for Numeric Columns ---
print(summary(data[numeric_cols]))
## symboling wheelbase carlength carwidth
## Min. :-2.0000 Min. : 86.60 Min. :144.6 Min. :61.80
## 1st Qu.: 0.0000 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.00
## Median : 1.0000 Median : 96.95 Median :173.2 Median :65.50
## Mean : 0.8595 Mean : 98.77 Mean :174.0 Mean :65.92
## 3rd Qu.: 2.0000 3rd Qu.:102.40 3rd Qu.:183.5 3rd Qu.:66.90
## Max. : 3.0000 Max. :115.60 Max. :202.6 Max. :72.30
## NA's :30 NA's :33 NA's :30 NA's :30
## carheight curbweight enginesize boreratio stroke
## Min. :47.8 Min. :1713 Min. :-500.0 Min. :2.540 Min. :2.070
## 1st Qu.:51.6 1st Qu.:2145 1st Qu.: 94.5 1st Qu.:3.150 1st Qu.:3.110
## Median :54.1 Median :2420 Median : 110.0 Median :3.310 Median :3.270
## Mean :53.7 Mean :2556 Mean : 99.9 Mean :3.325 Mean :3.255
## 3rd Qu.:55.5 3rd Qu.:2952 3rd Qu.: 143.0 3rd Qu.:3.580 3rd Qu.:3.410
## Max. :59.8 Max. :4066 Max. :2000.0 Max. :3.940 Max. :4.170
## NA's :30 NA's :30 NA's :28 NA's :32 NA's :30
## compressionratio horsepower peakrpm citympg
## Min. : 7.000 Min. :-50.00 Min. :4150 Min. :13.00
## 1st Qu.: 8.675 1st Qu.: 69.00 1st Qu.:4800 1st Qu.:19.00
## Median : 9.000 Median : 92.00 Median :5200 Median :24.00
## Mean : 67.985 Mean : 95.95 Mean :5142 Mean :25.15
## 3rd Qu.: 9.432 3rd Qu.:116.00 3rd Qu.:5500 3rd Qu.:30.00
## Max. :1000.000 Max. :288.00 Max. :6600 Max. :49.00
## NA's :27 NA's :27 NA's :30 NA's :30
## highwaympg price
## Min. :16.00 Min. :-1000
## 1st Qu.:25.00 1st Qu.: 7336
## Median :30.00 Median : 9984
## Mean :30.63 Mean :12451
## 3rd Qu.:34.00 3rd Qu.:16448
## Max. :54.00 Max. :45400
## NA's :30 NA's :27
cat("\n--- Summary Statistics for Categorical Columns ---\n")
##
## --- Summary Statistics for Categorical Columns ---
print(summary(data[categorical_cols]))
## fueltype aspiration doornumber carbody drivewheel
## : 27 : 30 : 30 :27 : 30
## diesel : 16 std :151 four:102 convertible: 6 4wd: 8
## gas :161 turbo: 34 two : 83 flyingcar :11 fwd:106
## unknown: 11 hardtop : 7 rwd: 71
## hatchback :59
## sedan :82
## wagon :23
## enginelocation enginetype cylindernumber fuelsystem
## : 30 ohc :132 : 30 mpfi :85
## front:182 : 30 eight : 4 2bbl :59
## rear : 3 ohcf : 13 five : 8 :30
## dohc : 12 four :145 idi :17
## ohcv : 12 six : 23 1bbl :11
## l : 11 twelve: 1 spdi : 8
## (Other): 5 two : 4 (Other): 5
# --- Step 5: Visualize Numeric Variables ---
cat("\n==== Step 5: Visualize Numeric Variables ====\n")
##
## ==== Step 5: Visualize Numeric Variables ====
# Draw a histogram for one numeric column of `data`.
#
# Args:
#   data:     data frame containing the column.
#   variable: column name as a string.
#
# aes(.data[[variable]]) replaces aes_string(), which was deprecated in
# ggplot2 3.0.0 (the deprecation warning appeared in the rendered output).
# sprintf() replaces glue(), which trims leading/trailing newlines and was
# silently dropping the intended line break after the message.
plot_numeric_variable <- function(data, variable) {
  cat(sprintf("\nDisplaying histogram for numeric variable: %s\n", variable))
  print(
    ggplot(data, aes(x = .data[[variable]])) +
      geom_histogram(fill = "lightblue", color = "black", bins = 30) +
      labs(title = paste("Histogram for", variable), x = variable, y = "Frequency") +
      theme_minimal()
  )
}
# invisible() keeps the list of ggplot objects returned by lapply() from being
# auto-printed at top level, which previously re-rendered every histogram a
# second time (the `[[1]]`, `[[2]]`, ... entries in the output).
if (length(numeric_cols) > 0) {
  invisible(lapply(numeric_cols, function(col) plot_numeric_variable(data, col)))
}
## Displaying histogram for numeric variable: symboling
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: wheelbase
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carlength
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carwidth
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: carheight
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: curbweight
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: enginesize
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: boreratio
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: stroke
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: compressionratio
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: horsepower
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: peakrpm
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: citympg
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: highwaympg
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Displaying histogram for numeric variable: price
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

## [[1]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[2]]
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[3]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[4]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[5]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[6]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[7]]
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[8]]
## Warning: Removed 32 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[9]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[10]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[11]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[12]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[13]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[14]]
## Warning: Removed 30 rows containing non-finite outside the scale range
## (`stat_bin()`).

##
## [[15]]
## Warning: Removed 27 rows containing non-finite outside the scale range
## (`stat_bin()`).

# --- Step 6: Visualize Categorical Variables ---
cat("\n==== Step 6: Visualize Categorical Variables ====\n")
##
## ==== Step 6: Visualize Categorical Variables ====
# Draw a bar plot for one categorical column of `data`.
#
# Args:
#   data:     data frame containing the column.
#   variable: column name as a string.
#
# aes(.data[[variable]]) replaces the deprecated aes_string(); sprintf()
# replaces glue(), whose trimming dropped the intended trailing newline.
plot_categorical_variable <- function(data, variable) {
  cat(sprintf("\nDisplaying barplot for categorical variable: %s\n", variable))
  print(
    ggplot(data, aes(x = .data[[variable]])) +
      geom_bar(fill = "skyblue", color = "darkblue") +
      labs(title = paste("Barplot for", variable), x = variable, y = "Count") +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  )
}
# invisible() suppresses auto-printing of the returned plot list, which
# previously re-rendered every bar plot a second time.
if (length(categorical_cols) > 0) {
  invisible(lapply(categorical_cols, function(col) plot_categorical_variable(data, col)))
}
## Displaying barplot for categorical variable: fueltype

## Displaying barplot for categorical variable: aspiration

## Displaying barplot for categorical variable: doornumber

## Displaying barplot for categorical variable: carbody

## Displaying barplot for categorical variable: drivewheel

## Displaying barplot for categorical variable: enginelocation

## Displaying barplot for categorical variable: enginetype

## Displaying barplot for categorical variable: cylindernumber

## Displaying barplot for categorical variable: fuelsystem

## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

# --- Reflection on Dataset Profiling ---
cat("\n==== Reflection on Dataset Profiling ====\n")
##
## ==== Reflection on Dataset Profiling ====
# Summarize the profiling step; glue() interpolates the dataset dimensions and
# the memory footprint computed in Step 2 into the report text. glue() trims
# the leading/trailing newlines of the template, so the text starts flush.
cat(glue("
1. **Dataset Size and Variables**:
- Rows: {dim(data)[1]}, Columns: {dim(data)[2]}.
- Approximate memory size: {format(data_size, units = 'auto')}.
2. **Variable Visualizations**:
- Histograms displayed for numeric variables (excluding car_ID).
- Bar plots displayed for categorical variables (excluding car_ID).
Profiling helps identify preprocessing needs, such as outlier handling, scaling, and encoding categorical features.
"))
## 1. **Dataset Size and Variables**:
## - Rows: 215, Columns: 26.
## - Approximate memory size: 59.4 Kb.
##
## 2. **Variable Visualizations**:
## - Histograms displayed for numeric variables (excluding car_ID).
## - Bar plots displayed for categorical variables (excluding car_ID).
##
## Profiling helps identify preprocessing needs, such as outlier handling, scaling, and encoding categorical features.
# === Data Preparation ===
# --- Step 1: Missing Values ---
# --- Sub-step 1.1: Identify Missing Values ---
cat("\n---- Sub-step 1.1: Identifying Missing Values ----\n")
##
## ---- Sub-step 1.1: Identifying Missing Values ----
# Count the NA entries in every column
missing_values <- colSums(is.na(data))
cat("Missing values in each column:\n")
## Missing values in each column:
print(missing_values)
## car_ID symboling CarName fueltype
## 30 30 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 33 30 30
## carheight curbweight enginetype cylindernumber
## 30 30 0 0
## enginesize fuelsystem boreratio stroke
## 28 0 32 30
## compressionratio horsepower peakrpm citympg
## 27 27 30 30
## highwaympg price
## 30 27
# Visualize Missing Values Before Handling
cat("\nVisualizing missing values (before handling)...\n")
##
## Visualizing missing values (before handling)...
# Keep only the columns that actually have missing entries
missing_df <- data.frame(Column = names(missing_values), MissingCount = missing_values)
missing_df <- subset(missing_df, MissingCount > 0)
if (nrow(missing_df) == 0) {
  cat("No missing values found in the dataset.\n")
} else {
  # Bar plot of per-column NA counts, ordered from most to least missing
  missing_plot_before <- ggplot(missing_df, aes(x = reorder(Column, -MissingCount), y = MissingCount)) +
    geom_bar(stat = "identity", fill = "skyblue", color = "darkblue") +
    labs(
      title = "Missing Values Per Column (Before Handling)",
      x = "Column",
      y = "Number of Missing Values"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(missing_plot_before)
}

# --- Sub-step 1.2: Handle Missing Values in car_ID ---
cat("\n---- Sub-step 1.2: Handling Missing Values in car_ID ----\n")
##
## ---- Sub-step 1.2: Handling Missing Values in car_ID ----
# car_ID uniquely identifies a record, so an NA there cannot be imputed;
# such rows are dropped instead.
if (anyNA(data$car_ID)) {
  cat("Handling missing values in car_ID: Removing rows with missing car_ID values.\n")
  data <- subset(data, !is.na(car_ID))
  cat("Rows with missing car_ID values removed.\n")
} else {
  cat("No missing values found in car_ID.\n")
}
## Handling missing values in car_ID: Removing rows with missing car_ID values.
## Rows with missing car_ID values removed.
# --- Sub-step 1.3: Handle Numeric Missing Values (Median Imputation) ---
cat("\n---- Sub-step 1.3: Handling Numeric Missing Values ----\n")
##
## ---- Sub-step 1.3: Handling Numeric Missing Values ----
# Refresh the numeric column list (rows were dropped in Sub-step 1.2)
numeric_cols <- names(data)[vapply(data, is.numeric, logical(1))]
# Median imputation: for each numeric column with NAs, substitute the median
# of the observed values (robust to the extreme values seen in Step 4).
for (col in numeric_cols) {
  na_mask <- is.na(data[[col]])
  if (any(na_mask)) {
    median_val <- median(data[[col]], na.rm = TRUE)
    data[[col]][na_mask] <- median_val
    cat(sprintf("Filled missing values in numeric column '%s' with median: %.2f\n", col, median_val))
  }
}
## Filled missing values in numeric column 'wheelbase' with median: 96.95
## Filled missing values in numeric column 'enginesize' with median: 110.00
## Filled missing values in numeric column 'boreratio' with median: 3.31
# --- Sub-step 1.4: Handle Categorical Missing Values (Mode Imputation) ---
cat("\n---- Sub-step 1.4: Handling Categorical Missing Values ----\n")
##
## ---- Sub-step 1.4: Handling Categorical Missing Values ----
# Columns holding nominal data, whether still character or already factor
categorical_cols <- names(data)[vapply(data, function(x) is.character(x) || is.factor(x), logical(1))]
# Mode imputation: replace NAs with the most frequent observed category.
# NOTE(review): several factor columns contain empty-string ("") levels (see
# the Step 4 summaries); "" is not NA and is therefore left untouched here —
# confirm whether blanks should also be treated as missing.
for (col in categorical_cols) {
  if (anyNA(data[[col]])) {
    freq <- table(data[[col]], useNA = "no")
    mode_val <- names(freq)[which.max(freq)]
    data[[col]][is.na(data[[col]])] <- mode_val
    cat(sprintf("Filled missing values in categorical column '%s' with mode: '%s'\n", col, mode_val))
  }
}
# --- Sub-step 1.5: Verify and Visualize Post-Imputation ---
cat("\n---- Sub-step 1.5: Verifying Post-Imputation ----\n")
##
## ---- Sub-step 1.5: Verifying Post-Imputation ----
# Re-count NAs per column; every count should now be zero
final_missing_values <- colSums(is.na(data))
cat("Remaining missing values in each column (should be 0):\n")
## Remaining missing values in each column (should be 0):
print(final_missing_values)
## car_ID symboling CarName fueltype
## 0 0 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 0 0 0
## carheight curbweight enginetype cylindernumber
## 0 0 0 0
## enginesize fuelsystem boreratio stroke
## 0 0 0 0
## compressionratio horsepower peakrpm citympg
## 0 0 0 0
## highwaympg price
## 0 0
# Visualize Missing Values After Handling
cat("\nVisualizing missing values (after handling)...\n")
##
## Visualizing missing values (after handling)...
final_missing_df <- data.frame(Column = names(final_missing_values), MissingCount = final_missing_values)
final_missing_df <- subset(final_missing_df, MissingCount > 0)
if (nrow(final_missing_df) == 0) {
  cat("All missing values have been successfully handled.\n")
} else {
  # Any column plotted here still has residual NAs after imputation
  missing_plot_after <- ggplot(final_missing_df, aes(x = reorder(Column, -MissingCount), y = MissingCount)) +
    geom_bar(stat = "identity", fill = "lightgreen", color = "darkgreen") +
    labs(
      title = "Missing Values Per Column (After Handling)",
      x = "Column",
      y = "Number of Missing Values"
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(missing_plot_after)
}
## All missing values have been successfully handled.
# --- Reflection on Step 1: Missing Values ---
cat("\n==== Reflection on Step 1: Missing Values ====\n")
##
## ==== Reflection on Step 1: Missing Values ====
# cat() joins vector elements with a single space, exactly as it joins
# multiple arguments, so the rendered text is unchanged.
step1_reflection <- c(
  "1. **Identification**:\n",
  "- Missing values were identified and visualized, allowing us to assess the extent of the issue.\n\n",
  "2. **Imputation Strategies**:\n",
  "- `car_ID`: Rows with missing values were removed as it is a unique identifier.\n",
  "- Numeric columns: Median imputation was used to ensure robustness to outliers.\n",
  "- Categorical columns: Mode imputation preserved the most common category.\n\n",
  "3. **Post-Imputation Analysis**:\n",
  "- All columns were rechecked to ensure no residual missing values.\n",
  "- Visualizations confirmed the success of the missing value handling process.\n\n",
  "By addressing missing values systematically, the dataset is now complete and ready for further preprocessing or modeling.\n"
)
cat(step1_reflection)
## 1. **Identification**:
## - Missing values were identified and visualized, allowing us to assess the extent of the issue.
##
## 2. **Imputation Strategies**:
## - `car_ID`: Rows with missing values were removed as it is a unique identifier.
## - Numeric columns: Median imputation was used to ensure robustness to outliers.
## - Categorical columns: Mode imputation preserved the most common category.
##
## 3. **Post-Imputation Analysis**:
## - All columns were rechecked to ensure no residual missing values.
## - Visualizations confirmed the success of the missing value handling process.
##
## By addressing missing values systematically, the dataset is now complete and ready for further preprocessing or modeling.
# --- Step 2: Handle Duplicates ---
cat("\n==== Step 2: Handle Duplicates ====\n")
##
## ==== Step 2: Handle Duplicates ====
# Deduplicate once and derive the duplicate count from the row-count delta
deduped <- dplyr::distinct(data)
num_duplicates <- nrow(data) - nrow(deduped)
cat(sprintf("Number of duplicate rows: %d\n", num_duplicates))
## Number of duplicate rows: 0
data <- deduped
cat(sprintf("Number of rows after removing duplicates: %d\n", nrow(data)))
## Number of rows after removing duplicates: 185
# --- Reflection on Step 2: Handling Duplicates ---
cat("\n==== Reflection on Step 2: Handling Duplicates ====\n")
##
## ==== Reflection on Step 2: Handling Duplicates ====
# cat() joins the vector elements (numbers included) with single spaces, just
# as it joins multiple arguments, so the rendered text is unchanged.
cat(c(
  "1. **Initial Check**:\n",
  "- Identified and reported the number of duplicate rows: ", num_duplicates, ".\n\n",
  "2. **Handling**:\n",
  "- Removed all duplicate rows using `dplyr::distinct()`.\n",
  "- Remaining rows after duplicate removal: ", nrow(data), ".\n\n",
  "By addressing duplicates, the dataset is now free of redundant rows, ensuring consistency and accuracy for subsequent preprocessing.\n"
))
## 1. **Initial Check**:
## - Identified and reported the number of duplicate rows: 0 .
##
## 2. **Handling**:
## - Removed all duplicate rows using `dplyr::distinct()`.
## - Remaining rows after duplicate removal: 185 .
##
## By addressing duplicates, the dataset is now free of redundant rows, ensuring consistency and accuracy for subsequent preprocessing.
# --- Step 3: Handle Outliers ---
cat("\n==== Step 3: Handle Outliers ====\n")
##
## ==== Step 3: Handle Outliers ====
# --- Sub-step 3.1: Identify Outliers ---
cat("\n---- Sub-step 3.1: Identifying Outliers in Numerical Columns ----\n")
##
## ---- Sub-step 3.1: Identifying Outliers in Numerical Columns ----
# Compute Tukey fences for a numeric vector.
#
# Args:
#   column: numeric vector; NAs are ignored when taking quantiles.
# Returns:
#   list with `lower` (Q1 - 1.5*IQR) and `upper` (Q3 + 1.5*IQR) bounds.
#
# The interquartile range is stored in lowercase `iqr` so it does not shadow
# stats::IQR() inside the function body.
identify_outliers <- function(column) {
  q1 <- quantile(column, 0.25, na.rm = TRUE)
  q3 <- quantile(column, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  list(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)
}
# Columns eligible for outlier treatment (all numeric columns)
numeric_cols <- names(Filter(is.numeric, data))
# Keep an untouched copy so the before/after boxplots below can be compared
data_original <- data
# --- Sub-step 3.2: Handle Outliers Using Capping ---
cat("\n---- Sub-step 3.2: Handling Outliers Using Capping ----\n")
##
## ---- Sub-step 3.2: Handling Outliers Using Capping ----
# Winsorize each numeric column at its Tukey fences: values outside
# [lower, upper] are pulled in to the nearest bound rather than removed.
for (col in numeric_cols) {
  bounds <- identify_outliers(data[[col]])
  outlier_idx <- which(data[[col]] < bounds$lower | data[[col]] > bounds$upper)
  if (length(outlier_idx) == 0) {
    cat(sprintf("Column: %-15s | Outliers: None\n", col))
  } else {
    data[[col]][outlier_idx] <- pmin(pmax(data[[col]][outlier_idx], bounds$lower), bounds$upper)
    cat(sprintf("Column: %-15s | Outliers Adjusted: %-4d | Bounds: [%.2f, %.2f]\n",
                col, length(outlier_idx), bounds$lower, bounds$upper))
  }
}
## Column: symboling | Outliers: None
## Column: wheelbase | Outliers Adjusted: 2 | Bounds: [82.65, 114.25]
## Column: carlength | Outliers: None
## Column: carwidth | Outliers Adjusted: 7 | Bounds: [59.65, 71.25]
## Column: carheight | Outliers: None
## Column: curbweight | Outliers: None
## Column: enginesize | Outliers Adjusted: 15 | Bounds: [25.00, 217.00]
## Column: boreratio | Outliers: None
## Column: stroke | Outliers Adjusted: 18 | Bounds: [2.66, 3.86]
## Column: compressionratio | Outliers Adjusted: 30 | Bounds: [7.40, 10.60]
## Column: horsepower | Outliers Adjusted: 14 | Bounds: [1.00, 185.00]
## Column: peakrpm | Outliers Adjusted: 2 | Bounds: [3750.00, 6550.00]
## Column: citympg | Outliers Adjusted: 1 | Bounds: [2.50, 46.50]
## Column: highwaympg | Outliers Adjusted: 2 | Bounds: [11.50, 47.50]
## Column: price | Outliers Adjusted: 13 | Bounds: [-6092.50, 30055.50]
# --- Sub-step 3.3: Compare Before and After Handling ---
cat("\n---- Sub-step 3.3: Comparing Before and After Outlier Handling ----\n")
##
## ---- Sub-step 3.3: Comparing Before and After Outlier Handling ----
# Side-by-side boxplots per numeric column: original (light blue) vs. capped
# (light green); the loop variable is named `comparison_plot` rather than
# `plot` to avoid shadowing base::plot().
for (col in numeric_cols) {
  comparison_plot <- ggplot() +
    geom_boxplot(data = data_original, aes(x = "Original Data", y = .data[[col]]),
                 outlier.colour = "red", fill = "lightblue", color = "darkblue") +
    geom_boxplot(data = data, aes(x = "After Capping", y = .data[[col]]),
                 outlier.colour = "red", fill = "lightgreen", color = "darkblue") +
    labs(title = paste("Comparison of", col, "- Before and After Outlier Handling"), x = "", y = col) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14)) +
    scale_y_continuous(labels = scales::comma)
  print(comparison_plot)
}















# --- Why Use Capping for Predicting Car Prices? ---
cat("\n---- Why Use Capping for Predicting Car Prices? ----\n")
##
## ---- Why Use Capping for Predicting Car Prices? ----
# cat() joins the vector elements with single spaces, matching the original
# multi-argument call.
cat(c(
  "Capping reduces the impact of outliers while retaining rare but valid cases (e.g., luxury cars),\n",
  "ensuring the model learns from all data without losing critical information.\n"
))
## Capping reduces the impact of outliers while retaining rare but valid cases (e.g., luxury cars),
## ensuring the model learns from all data without losing critical information.
# --- Reflection on Step 3: Handling Outliers ---
cat("\n==== Reflection on Step 3: Handling Outliers ====\n")
##
## ==== Reflection on Step 3: Handling Outliers ====
cat(c(
  "1. Outliers were detected using the IQR method and visualized with boxplots.\n",
  "2. Capping limited extreme values' influence while preserving data integrity.\n",
  "3. Boxplots showed the effectiveness of capping in reducing outlier impact.\n",
  "This ensures the dataset is robust and ready for analysis.\n"
))
## 1. Outliers were detected using the IQR method and visualized with boxplots.
## 2. Capping limited extreme values' influence while preserving data integrity.
## 3. Boxplots showed the effectiveness of capping in reducing outlier impact.
## This ensures the dataset is robust and ready for analysis.
# --- Step 4: Scaling / Normalizing Features ---
#
# Scaling involves normalizing numeric features to a consistent range, often between 0 and 1.
# This step is critical for machine learning algorithms sensitive to feature magnitude, such as:
# - Gradient Descent-Based Models (e.g., Linear or Logistic Regression)
# - K-Nearest Neighbors (KNN)
# - Support Vector Machines (SVM)
# - Neural Networks
#
# Min-Max Scaling transforms each feature using the formula:
# Scaled Value = (Value - Min) / (Max - Min)
# This ensures that all features contribute equally during model training.
#
#
# # Identify numeric columns
# numeric_cols <- names(data)[sapply(data, is.numeric)]
#
# # Apply Min-Max Scaling to Numeric Columns
# cat("\n---- Scaling Numeric Columns ----\n")
# scaled_data <- data # Create a copy of the dataset for scaling
# scaled_data[numeric_cols] <- lapply(data[numeric_cols], function(col) {
# scaled_col <- (col - min(col, na.rm = TRUE)) / (max(col, na.rm = TRUE) - min(col, na.rm = TRUE))
# return(scaled_col)
# })
# cat("Features scaled using Min-Max scaling.\n")
#
# # Optional: Visualize Original vs. Scaled Data
# cat("\n---- Visualizing Original vs. Scaled Data (Optional) ----\n")
# for (col in numeric_cols) {
# plot <- ggplot() +
# # Original data distribution
# geom_density(data = data, aes_string(x = col), fill = "lightblue", alpha = 0.5, color = "darkblue") +
# # Scaled data distribution
# geom_density(data = scaled_data, aes_string(x = col), fill = "lightgreen", alpha = 0.5, color = "darkgreen") +
# labs(
# title = paste("Comparison of", col, "- Original vs. Scaled"),
# x = col,
# y = "Density"
# ) +
# theme_minimal() +
# theme(
# plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
# )
# print(plot)
# }
# --- Reflection on Step 4: Scaling / Normalizing Features ---
cat("\n==== Reflection on Step 4: Scaling / Normalizing Features ====\n")
##
## ==== Reflection on Step 4: Scaling / Normalizing Features ====
# cat() joins the vector elements with single spaces, matching the original
# multi-argument call.
cat(c(
  "1. Scaling was skipped because the selected machine learning algorithms (e.g., Random Forest, Gradient Boosting)\n",
  " are tree-based models that do not rely on feature scaling.\n",
  "2. These models split data based on thresholds, making scaling unnecessary.\n",
  "3. Skipping scaling ensures computational efficiency without compromising model performance.\n",
  "4. If future models require distance-based methods (e.g., KNN or SVM), scaling can be revisited.\n"
))
## 1. Scaling was skipped because the selected machine learning algorithms (e.g., Random Forest, Gradient Boosting)
## are tree-based models that do not rely on feature scaling.
## 2. These models split data based on thresholds, making scaling unnecessary.
## 3. Skipping scaling ensures computational efficiency without compromising model performance.
## 4. If future models require distance-based methods (e.g., KNN or SVM), scaling can be revisited.
# --- Step 5: Data Transformation ---
cat("\n==== Step 5: Data Transformation ====\n")
##
## ==== Step 5: Data Transformation ====
# Collect the names of every numeric column in the dataset
numeric_flags <- vapply(data, is.numeric, logical(1))
numeric_cols <- names(data)[numeric_flags]
# Helper: skewness of a numeric vector, ignoring missing values
library(moments) # Provides skewness()
calculate_skewness <- function(column) skewness(column, na.rm = TRUE)
# Log-transform any strictly positive, highly skewed numeric column
cat("\n---- Applying Log Transformation to Highly Skewed Columns ----\n")
##
## ---- Applying Log Transformation to Highly Skewed Columns ----
for (col in numeric_cols) {
  values <- data[[col]]
  skew_val <- calculate_skewness(values) # per-column skewness
  cat(sprintf("Skewness of '%s': %.2f\n", col, skew_val))
  # Transform only when skewness exceeds 1 AND log() is defined everywhere
  eligible <- skew_val > 1 && min(values, na.rm = TRUE) > 0
  if (eligible) {
    data[[paste0(col, "_log")]] <- log(values)
    cat(sprintf("Applied log transformation to '%s' (Skewness: %.2f).\n", col, skew_val))
  } else {
    cat(sprintf("Skipped log transformation for '%s' (Skewness: %.2f).\n", col, skew_val))
  }
}
## Skewness of 'symboling': 0.15
## Skipped log transformation for 'symboling' (Skewness: 0.15).
## Skewness of 'wheelbase': 0.91
## Skipped log transformation for 'wheelbase' (Skewness: 0.91).
## Skewness of 'carlength': 0.16
## Skipped log transformation for 'carlength' (Skewness: 0.16).
## Skewness of 'carwidth': 0.88
## Skipped log transformation for 'carwidth' (Skewness: 0.88).
## Skewness of 'carheight': 0.06
## Skipped log transformation for 'carheight' (Skewness: 0.06).
## Skewness of 'curbweight': 0.66
## Skipped log transformation for 'curbweight' (Skewness: 0.66).
## Skewness of 'enginesize': 0.31
## Skipped log transformation for 'enginesize' (Skewness: 0.31).
## Skewness of 'boreratio': 0.02
## Skipped log transformation for 'boreratio' (Skewness: 0.02).
## Skewness of 'stroke': -0.38
## Skipped log transformation for 'stroke' (Skewness: -0.38).
## Skewness of 'compressionratio': 0.03
## Skipped log transformation for 'compressionratio' (Skewness: 0.03).
## Skewness of 'horsepower': 0.15
## Skipped log transformation for 'horsepower' (Skewness: 0.15).
## Skewness of 'peakrpm': 0.03
## Skipped log transformation for 'peakrpm' (Skewness: 0.03).
## Skewness of 'citympg': 0.54
## Skipped log transformation for 'citympg' (Skewness: 0.54).
## Skewness of 'highwaympg': 0.31
## Skipped log transformation for 'highwaympg' (Skewness: 0.31).
## Skewness of 'price': 0.92
## Skipped log transformation for 'price' (Skewness: 0.92).
# --- Visualize Original vs. Log Transformed Data ---
cat("\n---- Visualizing Original vs. Log Transformed Data ----\n")
##
## ---- Visualizing Original vs. Log Transformed Data ----
# For each column that was log-transformed, overlay the original (blue) and
# log-transformed (green) density curves.
# FIX: aes_string() is deprecated in ggplot2; use the .data pronoun for
# programmatic column mapping instead.
for (col in numeric_cols) {
  log_col <- paste0(col, "_log")
  if (log_col %in% names(data)) { # Ensure log-transformed column exists
    plot <- ggplot() +
      geom_density(data = data, aes(x = .data[[col]]), fill = "lightblue", alpha = 0.5, color = "darkblue") +
      geom_density(data = data, aes(x = .data[[log_col]]), fill = "lightgreen", alpha = 0.5, color = "darkgreen") +
      labs(
        title = paste("Comparison of", col, "- Original vs. Log Transformed"),
        x = col,
        y = "Density"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 14)
      )
    print(plot)
  }
}
# --- Reflection on Step 5: Data Transformation ---
cat("\n==== Reflection on Step 5: Data Transformation ====\n")
##
## ==== Reflection on Step 5: Data Transformation ====
# NOTE(review): per the output above, NO column exceeded the skewness > 1
# threshold, so no log columns were actually created in this run; points 1
# and 4 below describe the intended mechanism, not an applied transformation.
cat(
"1. Log transformation was applied to numeric columns with high skewness (Skewness > 1) and positive values.\n",
"2. This reduces skewness, stabilizes variance, and improves linear relationships for better linear regression performance.\n",
"3. Columns with low skewness or non-positive values were skipped to avoid unnecessary transformations or errors.\n",
"4. Visualization confirmed the effectiveness of log transformation in normalizing distributions.\n"
)
## 1. Log transformation was applied to numeric columns with high skewness (Skewness > 1) and positive values.
## 2. This reduces skewness, stabilizes variance, and improves linear relationships for better linear regression performance.
## 3. Columns with low skewness or non-positive values were skipped to avoid unnecessary transformations or errors.
## 4. Visualization confirmed the effectiveness of log transformation in normalizing distributions.
# === Step 6: Feature Engineering ===
cat("\n==== Step 6: Feature Engineering ====\n")
##
## ==== Step 6: Feature Engineering ====
# --- Sub-step 6.1: Review Existing Features ---
cat("\n---- Reviewing Existing Features ----\n")
##
## ---- Reviewing Existing Features ----
# Display column names and summary statistics
# NOTE(review): the knitted summary below shows suspicious values (e.g. a
# minimum price of -1000 and a 'flyingcar' body style) — presumably injected
# or retained by earlier cleaning steps; confirm upstream handling.
cat("Current column names:\n")
## Current column names:
print(names(data))
## [1] "car_ID" "symboling" "CarName" "fueltype"
## [5] "aspiration" "doornumber" "carbody" "drivewheel"
## [9] "enginelocation" "wheelbase" "carlength" "carwidth"
## [13] "carheight" "curbweight" "enginetype" "cylindernumber"
## [17] "enginesize" "fuelsystem" "boreratio" "stroke"
## [21] "compressionratio" "horsepower" "peakrpm" "citympg"
## [25] "highwaympg" "price"
cat("\nSummary statistics:\n")
##
## Summary statistics:
print(summary(data))
## car_ID symboling CarName fueltype aspiration
## 1 : 1 Min. :-2.0000 Length:185 : 0 : 0
## 2 : 1 1st Qu.: 0.0000 Class :character diesel : 16 std :151
## 3 : 1 Median : 1.0000 Mode :character gas :161 turbo: 34
## 4 : 1 Mean : 0.8595 unknown: 8
## 5 : 1 3rd Qu.: 2.0000
## 6 : 1 Max. : 3.0000
## (Other):179
## doornumber carbody drivewheel enginelocation wheelbase
## : 0 : 0 : 0 : 0 Min. : 86.60
## four:102 convertible: 6 4wd: 8 front:182 1st Qu.: 94.50
## two : 83 flyingcar : 8 fwd:106 rear : 3 Median : 96.95
## hardtop : 7 rwd: 71 Mean : 98.73
## hatchback :59 3rd Qu.:102.40
## sedan :82 Max. :114.25
## wagon :23
## carlength carwidth carheight curbweight enginetype
## Min. :144.6 Min. :61.80 Min. :47.8 Min. :1713 ohc :132
## 1st Qu.:166.3 1st Qu.:64.00 1st Qu.:51.6 1st Qu.:2145 ohcf : 13
## Median :173.2 Median :65.50 Median :54.1 Median :2420 dohc : 12
## Mean :174.0 Mean :65.90 Mean :53.7 Mean :2556 ohcv : 12
## 3rd Qu.:183.5 3rd Qu.:66.90 3rd Qu.:55.5 3rd Qu.:2952 l : 11
## Max. :202.6 Max. :71.25 Max. :59.8 Max. :4066 rotor : 4
## (Other): 1
## cylindernumber enginesize fuelsystem boreratio stroke
## : 0 Min. : 25.0 mpfi :85 Min. :2.540 Min. :2.660
## eight : 4 1st Qu.: 97.0 2bbl :59 1st Qu.:3.150 1st Qu.:3.110
## five : 8 Median :110.0 idi :17 Median :3.310 Median :3.270
## four :145 Mean :121.1 1bbl :11 Mean :3.325 Mean :3.262
## six : 23 3rd Qu.:145.0 spdi : 8 3rd Qu.:3.580 3rd Qu.:3.410
## twelve: 1 Max. :217.0 4bbl : 3 Max. :3.940 Max. :3.860
## two : 4 (Other): 2
## compressionratio horsepower peakrpm citympg
## Min. : 7.400 Min. : 1.00 Min. :4150 Min. :13.00
## 1st Qu.: 8.600 1st Qu.: 70.00 1st Qu.:4800 1st Qu.:19.00
## Median : 9.000 Median : 95.00 Median :5200 Median :24.00
## Mean : 9.114 Mean : 99.11 Mean :5142 Mean :25.14
## 3rd Qu.: 9.400 3rd Qu.:116.00 3rd Qu.:5500 3rd Qu.:30.00
## Max. :10.600 Max. :185.00 Max. :6550 Max. :46.50
##
## highwaympg price
## Min. :16.00 Min. :-1000
## 1st Qu.:25.00 1st Qu.: 7463
## Median :30.00 Median : 9995
## Mean :30.58 Mean :12277
## 3rd Qu.:34.00 3rd Qu.:16500
## Max. :47.50 Max. :30056
##
# --- Sub-step 6.2: Feature Engineering ---
cat("\n---- Engineering Features ----\n")
##
## ---- Engineering Features ----
# 1. **Interaction Features: Horsepower-to-Weight Ratio**
cat("\nCalculating Horsepower-to-Weight Ratio...\n")
##
## Calculating Horsepower-to-Weight Ratio...
# Power-to-weight is a classic performance proxy: horsepower per unit of curb weight
inputs_present <- all(c("horsepower", "curbweight") %in% names(data))
if (inputs_present) {
  data$hp_to_weight <- data$horsepower / data$curbweight
  cat("Horsepower-to-weight ratio created.\n")
} else {
  cat("Skipped horsepower-to-weight ratio due to missing 'horsepower' or 'curbweight'.\n")
}
## Horsepower-to-weight ratio created.
# 2. **Vehicle Segment Classification (Luxury, Standard, Economy)**
cat("\nClassifying Vehicle Segments...\n")
##
## Classifying Vehicle Segments...
if (!"price" %in% names(data)) {
  cat("Skipped vehicle segment classification due to missing 'price'.\n")
} else {
  # Price tertiles define the three segment boundaries
  price_cuts <- quantile(data$price, probs = c(0.33, 0.66, 1.0), na.rm = TRUE)
  data$segment <- cut(
    data$price,
    breaks = c(-Inf, price_cuts[1], price_cuts[2], price_cuts[3]),
    labels = c("Economy", "Standard", "Luxury"),
    include.lowest = TRUE
  )
  cat("Vehicle segments classified into Economy, Standard, and Luxury.\n")
}
## Vehicle segments classified into Economy, Standard, and Luxury.
# 3. **Fuel Economy Score**
cat("\nCalculating Fuel Economy Score...\n")
##
## Calculating Fuel Economy Score...
# Combined MPG: simple mean of the city and highway figures
mpg_present <- all(c("citympg", "highwaympg") %in% names(data))
if (mpg_present) {
  data$fuel_economy <- (data$citympg + data$highwaympg) / 2
  cat("Fuel economy score created.\n")
} else {
  cat("Skipped fuel economy score due to missing 'citympg' or 'highwaympg'.\n")
}
## Fuel economy score created.
# 4. **Brand Value**
cat("\nCreating Brand Value Feature...\n")
##
## Creating Brand Value Feature...
if (!"CarName" %in% names(data)) {
  cat("Skipped brand value due to missing 'CarName'.\n")
} else {
  # Brand = first word of CarName, lower-cased
  data <- data %>%
    mutate(CarBrand = tolower(gsub(" .*", "", CarName)))
  # Fix common misspellings so identical brands aggregate together
  brand_corrections <- c(
    "maxda" = "mazda", "vw" = "volkswagen", "vokswagen" = "volkswagen",
    "porcshce" = "porsche", "toyouta" = "toyota"
  )
  data$CarBrand <- recode(data$CarBrand, !!!brand_corrections)
  # Mean price per brand acts as a proxy for the brand's market value
  brand_avg_price <- data %>%
    group_by(CarBrand) %>%
    summarise(BrandAvgPrice = mean(price, na.rm = TRUE), .groups = "drop")
  data <- data %>%
    left_join(brand_avg_price, by = "CarBrand")
  cat("Brand value feature created.\n")
}
## Brand value feature created.
# --- Sub-step 6.3: Visualizing Features ---
cat("\n---- Visualizing New Features ----\n")
##
## ---- Visualizing New Features ----
library(ggplot2)
# 1. Visualize Horsepower-to-Weight Ratio
# BUG FIX: the original piped only the theme() call into print() (%>% binds
# tighter than +), so the theme list — not the plot — was printed; the knitted
# output confirmed it ("List of 1 ... plot.title"). Build the complete plot
# object first, then print it explicitly.
if ("hp_to_weight" %in% names(data)) {
  p_hp_weight <- ggplot(data, aes(x = hp_to_weight, y = price)) +
    geom_point(color = "firebrick", alpha = 0.7) +
    labs(
      title = "Horsepower-to-Weight Ratio vs Price",
      x = "Horsepower-to-Weight Ratio",
      y = "Price"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(color = "darkred", size = 14))
  print(p_hp_weight)
}
# 2. Visualize Vehicle Segment Distribution
# BUG FIX: as with the previous plot, print() received only the theme() object,
# so no chart was rendered. Also, the original passed a hard-coded 3-color fill
# vector directly to geom_bar(), which breaks if a segment level is absent;
# map the colors by level name through scale_fill_manual() instead.
if ("segment" %in% names(data)) {
  p_segment <- ggplot(data, aes(x = segment, fill = segment)) +
    geom_bar(color = "black") +
    scale_fill_manual(
      values = c(Economy = "#66c2a5", Standard = "#fc8d62", Luxury = "#8da0cb")
    ) +
    labs(
      title = "Vehicle Segment Distribution",
      x = "Segment",
      y = "Count"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(color = "darkblue", size = 14),
      legend.position = "none"
    )
  print(p_segment)
}
# 3. Visualize Fuel Economy Score
# BUG FIX: build the complete plot, then print it — the original printed only
# the theme() list because %>% bound tighter than the ggplot '+' chain.
if ("fuel_economy" %in% names(data)) {
  p_fuel <- ggplot(data, aes(x = fuel_economy, y = price)) +
    geom_point(color = "darkgreen", alpha = 0.7) +
    labs(
      title = "Fuel Economy Score vs Price",
      x = "Fuel Economy Score",
      y = "Price"
    ) +
    theme_minimal() +
    theme(plot.title = element_text(color = "forestgreen", size = 14))
  print(p_fuel)
}
# 4. Visualize Brand Value (Average Price by Brand)
# FIX: wrap the plot in print() — a top-level ggplot inside an if-block is not
# rendered when the script is run via source(), only under interactive/knitr
# auto-printing.
if ("BrandAvgPrice" %in% names(data)) {
  p_brand <- ggplot(data, aes(x = reorder(CarBrand, price, median), y = price, fill = CarBrand)) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2, alpha = 0.7) + # Boxplot with outliers highlighted
    labs(
      title = "Price Distribution by Car Brand",
      x = "Car Brand",
      y = "Price"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(color = "darkblue", size = 14),
      axis.text.x = element_text(angle = 45, hjust = 1)
    ) +
    scale_fill_manual(values = rainbow(length(unique(data$CarBrand)))) # One color per brand
  print(p_brand)
} else {
  cat("BrandAvgPrice or CarBrand is not available for visualization.\n")
}
# --- Reflection on Step 6: Feature Engineering ---
cat("\n==== Reflection on Step 6: Feature Engineering ====\n")
##
## ==== Reflection on Step 6: Feature Engineering ====
cat(
"1. Horsepower-to-Weight Ratio:\n",
" - Captures the relationship between engine power and vehicle weight.\n",
" - Indicates vehicle performance, relevant for pricing decisions.\n",
"2. Vehicle Segments:\n",
" - Classified cars into Economy, Standard, and Luxury based on price quantiles.\n",
" - Adds domain knowledge about market positioning of vehicles.\n",
"3. Fuel Economy Score:\n",
" - Combines city and highway MPG into a single efficiency metric.\n",
" - Provides insights into fuel consumption trends.\n",
"4. Brand Value:\n",
" - Created 'BrandAvgPrice' to reflect consumer perception and reliability for each brand.\n",
"These features ensure linear relationships, interpretability, and improved model performance for linear regression.\n"
)
## 1. Horsepower-to-Weight Ratio:
## - Captures the relationship between engine power and vehicle weight.
## - Indicates vehicle performance, relevant for pricing decisions.
## 2. Vehicle Segments:
## - Classified cars into Economy, Standard, and Luxury based on price quantiles.
## - Adds domain knowledge about market positioning of vehicles.
## 3. Fuel Economy Score:
## - Combines city and highway MPG into a single efficiency metric.
## - Provides insights into fuel consumption trends.
## 4. Brand Value:
## - Created 'BrandAvgPrice' to reflect consumer perception and reliability for each brand.
## These features ensure linear relationships, interpretability, and improved model performance for linear regression.
# === Step 7: Visualizing Features Against Price ===
cat("\n==== Step 7: Visualizing Features Against Price ====\n")
##
## ==== Step 7: Visualizing Features Against Price ====
library(ggplot2)
library(dplyr)
library(scales)
# --- Sub-step 7.1: Boxplots for Categorical Variables Against Price ---
cat("\n---- Boxplots for Categorical Variables Against Price ----\n")
##
## ---- Boxplots for Categorical Variables Against Price ----
# Identify categorical columns
# NOTE(review): only character columns are selected here, so factor columns
# (fueltype, carbody, drivewheel, ... per the Step 6 summary) are excluded
# from the boxplot loop — presumably intentional; confirm.
categorical_cols <- names(data)[sapply(data, is.character)]
cat("Categorical Columns Identified:\n")
## Categorical Columns Identified:
print(categorical_cols)
## [1] "CarName" "CarBrand"
# Clean data for categorical boxplots: drop rows containing any NA or
# empty-string cell.
# FIX: the original used apply() on a data.frame, which coerces every row to a
# character matrix (losing column types and scanning each row in R); dplyr's
# if_all() applies the same test column-wise without the coercion.
data_cleaned <- data %>%
  filter(!is.na(price)) %>%
  filter(if_all(everything(), ~ !is.na(.x) & .x != ""))
if (nrow(data_cleaned) == 0) {
  stop("Error: The dataset 'data_cleaned' is empty after removing null or empty values.")
}
# Generate boxplots for categorical variables.
# FIXES: aes_string() is deprecated — use the .data pronoun; slice(1:10) and
# rainbow(10) assumed exactly 10 categories, which fails/misfires when fewer
# unique CarNames exist — use slice_head(n = 10) and size the palette to the
# rows actually selected.
for (col in categorical_cols) {
  if (col == "CarName") {
    # Handle 'CarName': Visualize top 10 most frequent car names
    cat(sprintf("\nColumn '%s' has many categories. Visualizing top 10 most frequent.\n", col))
    # Get the (up to) 10 most frequent CarNames
    top_cars <- data_cleaned %>%
      count(CarName) %>%
      arrange(desc(n)) %>%
      slice_head(n = 10)
    # Filter data to include only the selected CarNames
    filtered_data <- data_cleaned %>% filter(CarName %in% top_cars$CarName)
    # Create the boxplot
    plot <- ggplot(filtered_data, aes(x = reorder(CarName, price, FUN = median), y = price, fill = CarName)) +
      geom_boxplot(outlier.colour = "gold", outlier.shape = 16, outlier.size = 3, alpha = 0.8) +
      labs(
        title = "Price Distribution by Top 10 Most Frequent Car Names",
        x = "Car Name",
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "purple"),
        axis.text.x = element_text(angle = 45, hjust = 1, color = "darkblue"),
        axis.text.y = element_text(color = "darkgreen")
      ) +
      scale_y_continuous(labels = scales::comma) +
      # Palette sized to the categories actually plotted, not a fixed 10
      scale_fill_manual(values = grDevices::rainbow(nrow(top_cars)))
    print(plot)
  } else {
    # Standard boxplots for other categorical variables
    num_colors <- length(unique(data_cleaned[[col]]))
    plot <- ggplot(data_cleaned, aes(x = .data[[col]], y = price, fill = .data[[col]])) +
      geom_boxplot(outlier.colour = "gold", outlier.shape = 16, outlier.size = 3, alpha = 0.8) +
      labs(
        title = paste("Price Distribution by", col),
        x = col,
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "purple"),
        axis.text.x = element_text(angle = 45, hjust = 1, color = "darkblue"),
        axis.text.y = element_text(color = "darkgreen")
      ) +
      scale_y_continuous(labels = scales::comma) +
      scale_fill_manual(values = grDevices::rainbow(num_colors)) # Dynamic palette for all categories
    print(plot)
  }
}
##
## Column 'CarName' has many categories. Visualizing top 10 most frequent.
cat("\n---- Why Boxplots for Categorical Variables? ----\n")
##
## ---- Why Boxplots for Categorical Variables? ----
cat(
"Boxplots help visualize price distributions across categories, highlighting medians, variability, and outliers.
For 'CarName', only the top 10 most frequent categories are visualized for better interpretability.\n"
)
## Boxplots help visualize price distributions across categories, highlighting medians, variability, and outliers.
## For 'CarName', only the top 10 most frequent categories are visualized for better interpretability.
# --- Sub-step 7.2: Scatterplots for Numerical Variables Against Price ---
cat("\n---- Scatterplots for Numerical Variables Against Price ----\n")
##
## ---- Scatterplots for Numerical Variables Against Price ----
# Identify numerical columns
# NOTE: this overwrites the Step 5 'numeric_cols' — it is now derived from
# the row-filtered 'data_cleaned', not from 'data'.
numeric_cols <- names(data_cleaned)[sapply(data_cleaned, is.numeric)]
cat("Numerical Columns Identified:\n")
## Numerical Columns Identified:
print(numeric_cols)
## [1] "symboling" "wheelbase" "carlength" "carwidth"
## [5] "carheight" "curbweight" "enginesize" "boreratio"
## [9] "stroke" "compressionratio" "horsepower" "peakrpm"
## [13] "citympg" "highwaympg" "price" "hp_to_weight"
## [17] "fuel_economy" "BrandAvgPrice"
# Generate scatterplots for each numerical variable against price.
# FIXES: aes_string() is deprecated — use the .data pronoun; the CarName color
# palette does not depend on the loop variable, so hoist it out of the loop
# instead of recomputing it every iteration.
num_colors <- length(unique(data_cleaned$CarName))
color_palette <- grDevices::rainbow(num_colors)
for (col in numeric_cols) {
  if (col != "price") {
    cat(sprintf("\nCreating scatterplot for '%s' vs 'price'.\n", col))
    plot <- ggplot(data_cleaned, aes(x = .data[[col]], y = price)) +
      geom_point(aes(color = CarName), size = 2.5, alpha = 0.6) + # Adjusted size and transparency for points
      geom_smooth(method = "lm", color = "red", se = FALSE, linewidth = 1) + # Trendline for correlation
      labs(
        title = paste("Relationship Between", col, "and Price"),
        x = col,
        y = "Price"
      ) +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16, color = "darkred"),
        axis.text.x = element_text(size = 10, color = "darkblue"),
        axis.text.y = element_text(size = 10, color = "darkgreen"),
        legend.position = "none" # Remove legend for simplicity
      ) +
      scale_y_continuous(labels = scales::comma) +
      scale_x_continuous(labels = scales::comma) +
      scale_color_manual(values = color_palette) # Apply the hoisted palette
    print(plot)
  }
}
##
## Creating scatterplot for 'symboling' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'wheelbase' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carlength' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carwidth' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'carheight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'curbweight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'enginesize' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'boreratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'stroke' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'compressionratio' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'horsepower' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'peakrpm' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'citympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'highwaympg' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'hp_to_weight' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'fuel_economy' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

##
## Creating scatterplot for 'BrandAvgPrice' vs 'price'.
## `geom_smooth()` using formula = 'y ~ x'

# Rationale for the scatterplot sub-step above
cat("\n---- Why Scatterplots for Numerical Variables? ----\n")
##
## ---- Why Scatterplots for Numerical Variables? ----
cat(
"Scatterplots reveal relationships, trends, and correlations between numerical variables and price.
Including a linear trendline helps identify significant predictors for car pricing.\n"
)
## Scatterplots reveal relationships, trends, and correlations between numerical variables and price.
## Including a linear trendline helps identify significant predictors for car pricing.
# === Step 8: Redundant Features and Feature Selection ===
cat("\n==== Step 8: Redundant Features and Feature Selection ====\n")
##
## ==== Step 8: Redundant Features and Feature Selection ====
library(caret) # For findCorrelation function
library(ggplot2) # For heatmap visualization
library(dplyr) # For data manipulation
library(reshape2) # For reshaping correlation matrix
# --- Sub-step 8.1: Check and Remove Redundant Features ---
cat("\n---- Checking and Removing Redundant Features ----\n")
##
## ---- Checking and Removing Redundant Features ----
# Identify and remove explicitly redundant columns.
# BUG FIX: the ID column is named 'car_ID' (see names(data) in Step 6.1), but
# the original listed 'Car_ID', so the identifier was never dropped ("No
# specified redundant features found"). Also use intersect() so removal still
# works when only some of the listed columns are present.
redundant_cols <- c("car_ID")
cols_to_drop <- intersect(redundant_cols, names(data))
if (length(cols_to_drop) > 0) {
  data <- data %>% select(-all_of(cols_to_drop))
  cat("Removed redundant features:\n")
  print(cols_to_drop)
} else {
  cat("No specified redundant features found.\n")
}
# --- Sub-step 8.2: Feature Selection Based on Correlation Matrix ---
cat("\n---- Feature Selection Based on Correlation Matrix ----\n")
##
## ---- Feature Selection Based on Correlation Matrix ----
# Identify numeric columns (recomputed: previous steps added/removed columns)
numeric_cols <- names(data)[sapply(data, is.numeric)]
cat("Numeric Columns Identified:\n")
## Numeric Columns Identified:
print(numeric_cols)
# Ensure there are numeric columns
if (length(numeric_cols) == 0) {
  stop("Error: No numeric columns found in the dataset.")
}
# Generate the correlation matrix for numeric columns
correlation_matrix <- cor(data[numeric_cols], use = "complete.obs")
# Reshape the correlation matrix for visualization.
# BUG FIX: with data.table attached, a bare melt() call hits data.table's
# deprecated redirect for matrices (it warned in the original run and will
# become an error in a future data.table release); call reshape2::melt()
# explicitly via its namespace.
correlation_melted <- reshape2::melt(correlation_matrix)
colnames(correlation_melted) <- c("Feature1", "Feature2", "Correlation")
# Heatmap of pairwise correlations among the numeric features
heatmap_plot <- ggplot(correlation_melted, aes(x = Feature1, y = Feature2, fill = Correlation)) +
  geom_tile(color = "white") + # White gridlines between tiles
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red", midpoint = 0,
    limit = c(-1, 1), name = "Correlation" # Fix the scale to the full [-1, 1] range
  ) +
  labs(
    title = "Correlation Matrix of Numeric Features",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10, color = "darkblue"),
    axis.text.y = element_text(size = 10, color = "darkblue"),
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5, color = "darkred")
  )
# Render the heatmap
print(heatmap_plot)
# Identify features with pairwise |correlation| above the 0.9 cutoff
highly_correlated <- findCorrelation(correlation_matrix, cutoff = 0.9, names = TRUE)
# Drop them to reduce multicollinearity before modeling
if (length(highly_correlated) == 0) {
  cat("No highly correlated features found.\n")
} else {
  cat("Highly correlated features identified and removed:\n")
  print(highly_correlated)
  data <- data %>% select(-all_of(highly_correlated))
}
## Highly correlated features identified and removed:
## [1] "fuel_economy" "highwaympg"
# --- Reflection on Step 8 ---
cat("\n==== Reflection on Step 8 ====\n")
##
## ==== Reflection on Step 8 ====
# Summary of the redundancy / multicollinearity pass above
cat(
"1. Redundant Features:\n",
" - Explicitly removed columns like 'Car_ID' which do not contribute to predictive modeling.\n\n",
"2. Correlation Matrix:\n",
" - Created a heatmap to visualize correlations among numeric features.\n",
" - Identified features with high correlation (cutoff: >0.9) and removed them to reduce multicollinearity.\n\n",
"By performing this step, the dataset is now more refined and ready for modeling.\n"
)
## 1. Redundant Features:
## - Explicitly removed columns like 'Car_ID' which do not contribute to predictive modeling.
##
## 2. Correlation Matrix:
## - Created a heatmap to visualize correlations among numeric features.
## - Identified features with high correlation (cutoff: >0.9) and removed them to reduce multicollinearity.
##
## By performing this step, the dataset is now more refined and ready for modeling.
# === Step 9: Export Final Dataset ===
cat("\n==== Step 9: Export Final Dataset ====\n")
##
## ==== Step 9: Export Final Dataset ====
# Define the file name for the final dataset
# NOTE: written to the current working directory; no path prefix is applied.
output_file <- "car_prices_preprocessed.csv" # Descriptive name based on the context
# Export the dataset. tryCatch reports a write failure (e.g. locked file or
# read-only directory) without aborting the rest of the script.
tryCatch(
{
write.csv(data, output_file, row.names = FALSE)
cat(sprintf("Final dataset successfully exported to '%s'.\n", output_file))
},
error = function(e) {
cat(sprintf("Error exporting the final dataset: %s\n", e$message))
}
)
## Final dataset successfully exported to 'car_prices_preprocessed.csv'.
# --- Reflection on Step 9 ---
cat("\n==== Reflection on Step 9 ====\n")
##
## ==== Reflection on Step 9 ====
cat(
"1. The cleaned and preprocessed dataset has been saved as a CSV file named 'car_prices_preprocessed.csv'.\n",
"2. The file name reflects the dataset's purpose, making it easier to identify and reuse.\n",
"3. This ensures consistency and reproducibility, providing a high-quality dataset ready for modeling or analysis.\n"
)
## 1. The cleaned and preprocessed dataset has been saved as a CSV file named 'car_prices_preprocessed.csv'.
## 2. The file name reflects the dataset's purpose, making it easier to identify and reuse.
## 3. This ensures consistency and reproducibility, providing a high-quality dataset ready for modeling or analysis.